Application to Migration Flows

Thibault Laurent

2025-09-22

This document provides the R code used to reproduce the results presented in Chapter 3 of Thibault Laurent’s thesis.
While the theoretical framework is based on the original article, the empirical application has been adapted and extended in this chapter to focus on international migration flows.
The present supplementary material concentrates exclusively on the application, offering the scripts and comments needed to replicate the results. Note that the supplementary material of the original article is available here : https://github.com/tibo31/spatial_coda_elasticities

List of required packages:

devtools::install_github('rensa/ggflags')
source("codes/corrplot.R")
# Visualization and color palettes
library(colorspace)   # advanced color palettes
library(ggridges)     # ridge plots for distributions
library(ggstance)     # horizontal variants of ggplot geoms
library(ggflags)      # country flags for ggplot
library(tidytext)
library(forcats)

# Spatial data and econometrics
library(sf)           # handling spatial vector data
library(spdep)        # spatial econometrics and spatial weights

# Data manipulation and plotting
library(tidyverse)    # data wrangling and visualization framework

# Correlation visualization
library(corrplot)     # correlation matrix plots

# Correspondance between iso3 and country names
library(countrycode)

1 Data preparation

1.1 Migration flows

We use bilateral migration flows estimated with the optimal transport method, as introduced in Chapter 1 of the thesis.
For this application, the analysis is restricted to a single period (2020–2024) and covers 230 countries and territories.

To capture persistence in migration dynamics, we also include flows from the previous period.
Past migration patterns are known to influence current ones through established networks, lower informational and transaction costs, and the presence of diasporas in destination countries.
Incorporating this dynamic component helps provide a more comprehensive understanding of the structural mechanisms driving international migration.

my_years <- c(1990, 1995, 2000, 2005, 2010, 2015, 2020, 2024)
l <- 7
start_period <- my_years[l]
end_period <- my_years[l + 1]
period_of_interest <- paste0(start_period, "-", end_period) 
# our estimates
load("data/our_estimates_2024.RData")
pairs_od <- our_estimates_5y %>%
  filter(year == period_of_interest,
         type == "tot") %>%
  mutate(y = hat_transport_open_outwards + 
           hat_transport_open_returns + 
           hat_transport_open_transits) %>%
  select(origin, dest, y) %>%
  rename(cont_o = origin,
         cont_d = dest) 

pairs_od_2 <- our_estimates_5y %>%
  filter(year == paste0(my_years[l-1], "-", my_years[l]) ,
         type == "tot") %>%
  mutate(y = hat_transport_open_outwards + 
           hat_transport_open_returns + 
           hat_transport_open_transits) %>%
  select(origin, dest, y) %>%
  rename(cont_o = origin,
         cont_d = dest,
         y_1 = y) 

pairs_od <- merge(pairs_od, pairs_od_2, by = c("cont_o", "cont_d"))
row.names(pairs_od) <- paste0(pairs_od$cont_o,
                              "-", pairs_od$cont_d)
unique_iso3 <- unique(c(pairs_od$cont_o, pairs_od$cont_d))

1.2 Climate data

We rely on the climate indicators introduced in Chapter 2, constructed at the country level and aggregated over five-year periods.
For this application, the data are aggregated from the beginning of the observation window (2020) to its end (2024).

The indicators capture deviations from long-term climatic norms, the persistence of wet and dry conditions, and the frequency, duration, and intensity of extreme events.
Although we prepare the full set of climatic variables described in Chapter 2, in this application we focus on two key indicators: heatwave frequency (HWF) and dry spells (Dry).

load("data/final_indicator.RData")
final_indicator[, "t2m_diff"] <- apply(st_drop_geometry(final_indicator)[, paste0("t2m_diff_", start_period:(end_period - 1))], 1, mean)
final_indicator[, "prec_diff"] <- apply(st_drop_geometry(final_indicator)[, paste0("prec_diff_", start_period:(end_period - 1))], 1, mean)
final_indicator[, "ghwr_35"] <- apply(st_drop_geometry(final_indicator)[, paste0("ghwr_35_", start_period:(end_period - 1))], 1, mean)
final_indicator[, "P5D"] <- apply(st_drop_geometry(final_indicator)[, paste0("prec_5days_", start_period:(end_period - 1))], 1, mean)
final_indicator[, "dry"] <- apply(st_drop_geometry(final_indicator)[, paste0("dry_",start_period:(end_period - 1))], 1, mean)
final_indicator[, "wet"] <- apply(st_drop_geometry(final_indicator)[, paste0("wet_", start_period:(end_period - 1))], 1, mean)

final_indicator[, "wsdi"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_wsdi_upp_", start_period:(end_period - 1))], 1, mean)
final_indicator[, "wsd"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_wsd_upp_", start_period:(end_period - 1))], 1, max)
final_indicator[, "wsei"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_wsei_upp_", start_period:(end_period - 1))], 1, mean)
final_indicator[, "csdi"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_wsdi_low_", start_period:(end_period - 1))], 1, mean)
final_indicator[, "csd"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_wsd_low_", start_period:(end_period - 1))], 1, max)
final_indicator[, "csei"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_wsei_low_", start_period:(end_period - 1))], 1, mean)
final_indicator[, "hwf"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_hwf_upp_", start_period:(end_period - 1))], 1, mean)
final_indicator[, "hwd"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_hwd_upp_", start_period:(end_period - 1))], 1, max)
final_indicator[, "hwei"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_hwei_upp_", start_period:(end_period - 1))], 1, mean)
final_indicator[, "cwf"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_hwf_low_", start_period:(end_period - 1))], 1, mean)
final_indicator[, "cwd"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_hwd_low_", start_period:(end_period - 1))], 1, max)
final_indicator[, "cwei"] <- apply(st_drop_geometry(final_indicator)[, paste0("E_hwei_low_", start_period:(end_period - 1))], 1, mean)

nom_vars <- c("t2m_diff", "prec_diff", "ghwr_35", "P5D", "dry", "wet", 
              "wsdi", "wsd", "wsei", 
              "csdi", "csd", "csei", 
              "hwf", "hwd", "hwei",
              "cwf", "cwd", "cwei")

temp <- st_drop_geometry(
  final_indicator[final_indicator$iso3 %in% unique_iso3, ])

my_X <- temp[, c("iso3", nom_vars)]

1.3 Natural disaster data

In addition to conflict-related shocks, we also incorporate information on natural disasters using the EM-DAT database maintained by the Centre for Research on the Epidemiology of Disasters (CRED) (Guha-Sapir et al., 2015).
The EM-DAT database provides global, systematic, and standardized records of natural and technological disasters, including information on the type of event (e.g., floods, droughts, earthquakes, storms), its location, timing, and associated human and economic impacts.

For this application, we focus on the variable Total Affected, which aggregates the number of individuals injured, made homeless, or otherwise requiring immediate assistance following a disaster. This measure is particularly relevant because it captures the overall human impact of disasters, beyond fatalities alone, and therefore provides a better proxy for potential migration pressures.

By combining conflict and disaster data, we account for both violent and environmental shocks that may influence migration dynamics, either by directly displacing populations or by affecting long-term living conditions.

natural <- readxl::read_xlsx("data/public_emdat_custom_request_2025-03-04_0bfb4e4c-60c3-4bec-b9a7-e533bd2e3945.xlsx")
natural <- natural %>%
  filter(`Start Year` %in% start_period:(end_period - 1))
iso3_nat <-  natural$ISO

natural <- as.data.frame(natural[, 
    -c(1, 2, 3, 6, 7, 8, 9, 10, 11, 14, 15, 16, 23, 24, 25, 26:31, 43:46)])

for(k in 1:ncol(natural)) {
  if(class(natural[[k]]) == "character")
    natural[[k]] <- factor(natural[[k]])
}

natural <- missForest::missForest(natural)
natural_imp <- natural$ximp

natural <- aggregate(natural_imp$`Total Affected`, 
                    by = list(ISO3 = iso3_nat),
                    FUN = sum, na.rm = T)
names(natural) <- c("iso3", "nat_dis")
save(natural, file = "data/natural.RData")
load("data/natural.RData")
natural$Lnat_dis <- log(1 + natural$nat_dis)
# merge natural
my_X <- merge(my_X, natural, by = "iso3", all.x = T)

1.4 Conflict data

Since some migration flows may be driven by conflict-related events, we incorporate data from the Armed Conflict Location & Event Data Project (ACLED), a widely used source on political violence and protest events (Raleigh et al., 2010).
From this dataset, we use the variable fatalities, which records the number of reported deaths associated with each conflict or protest event. We aggregate this variable at the country level over the observation period in order to capture the intensity of violence and its potential influence on migration flows.

conflict <- read.csv('data/1997-01-01-2024-06-30.csv')
conflict$event_date <- as.Date(conflict$event_date,
                               format = c("%d %B %Y"))
conflict <- conflict %>%
  filter(conflict$event_date > paste0(start_period, "-06-01") & 
           conflict$event_date < paste0(end_period, "-06-01"))
conflict$fatalities <- as.numeric(conflict$fatalities)
conflict_period <- aggregate(conflict[, c("fatalities")],
                             by = list(country = conflict$country),
                             FUN = sum, na.rm = T)

conflict_period$iso3 <- countrycode::countrycode(conflict_period$country, 
                                                  "country.name", "iso3c")
# categorized 
conflict_period$class_conflict <- findInterval(conflict_period$x, 
                        c(0, 50, 1000, 10000, 1000000), all.inside = TRUE)

# merge conflict 
my_X <- merge(my_X, conflict_period[, c("iso3", "class_conflict")],
              by = "iso3", all.x = T)
# fill missing
my_X[is.na(my_X$class_conflict), "class_conflict"] <- c(2, 1, 1, 1, 1, 1, 1)
# discretize the variable
my_X$class_conflict <- c(c("low", "medium", "high", "very high"))[my_X$class_conflict]

1.5 Political stability and human development

To further account for structural drivers of migration, we include measures of political stability and human development.
First, we use the Political Stability and Absence of Violence/Terrorism index from the World Development Indicators (WDI) published by the World Bank (World Bank, 2025). This indicator reflects perceptions of the likelihood that a government will be destabilized or overthrown by unconstitutional or violent means, including politically motivated violence and terrorism. It provides a relevant proxy for the institutional and security environment in origin and destination countries.

Second, we rely on the Human Development Index (HDI) as reported in the Human Development Report 2025 Statistical Annex (UNDP, 2025). The HDI is a composite measure that captures average achievements in three key dimensions of human development: health (life expectancy at birth), education (mean years of schooling and expected years of schooling), and standard of living (gross national income per capita). We use it as a broad measure of socio-economic conditions that may shape migration decisions and opportunities.

wb <- read.csv("data/6fa98486-1b47-4578-8979-58b9491e024d_Data.csv")

noms_var <- "politicalstability"
temp <- wb[wb$Series.Code == "PV.EST", -c(1, 2, 3)]
names(temp)[-1] <- c("1990-1995", "1995-2000", "2000-2005", "2005-2010",
                          "2010-2015", "2015-2020", "2020-2024")
political <- pivot_longer(temp, cols = 2:8, names_to = "period", 
                          values_to = noms_var) %>%
    filter(period == period_of_interest)
political[[noms_var]] <- as.numeric(political[[noms_var]] )
names(political)[1] <- "iso3"  

hdi <- readxl::read_xlsx("data/HDR25_Statistical_Annex_HDI_Table.xlsx",
                         skip = 7) %>%
  rename(num = 1, country = 2, HDI = 3) %>%
  filter(!is.na(num)) %>%
  select(2, 3) 
hdi$HDI <- as.numeric(hdi$HDI)
hdi$iso3 <- countrycode::countrycode(hdi$country, "country.name", "iso3c")
unique_iso3[!(unique_iso3 %in% hdi$iso3)]
##  [1] "ABW" "AIA" "ASM" "BMU" "COK" "CUW" "CYM" "ESH" "FLK" "FRO" "GIB" "GLP"
## [13] "GRL" "GUF" "GUM" "IMN" "MAC" "MCO" "MNP" "MSR" "MTQ" "MYT" "NCL" "NIU"
## [25] "PRI" "PRK" "PYF" "REU" "SHN" "SPM" "SXM" "TCA" "TKL" "TWN" "VGB" "VIR"
## [37] "WLF"
hdi_b <- read.csv("data/hdi_proxies_missing_un.csv") 
hdi_b <- hdi_b[, c("name", "hdi", "iso3")] 
names(hdi_b) <- c("country", "HDI", "iso3")
hdi <- rbind(hdi, hdi_b)
hdi <- hdi[, -1]

# merge political stability
my_X <- merge(my_X, political[, c("iso3", noms_var)], by = "iso3", all.x = T)
# Merge HDI
my_X <- merge(my_X, hdi, by = "iso3",  all.x = T)

1.6 Population and GDP

We use World Bank World Development Indicators (WDI) to retrieve data on GDP per capita (PPP), population, and unemployment for all countries and territories over 2020–2023. Missing or unreliable values are complemented with manual estimates and imputations (e.g., using parent economies for territories), in order to obtain a consistent dataset covering the 230 units in the migration analysis.

# install.packages(c("wbstats","dplyr","readr"))
library(wbstats)
library(dplyr)
library(readr)

# 1) Liste cible
iso3_list <- unique_iso3  # raccourci

# 2) Indicateurs à récupérer
indicators <- c(
  gdp_ppp      = "NY.GDP.PCAP.PP.CD",       # PPP courant
  population   = "SP.POP.TOTL",
  unemployment = "SL.UEM.TOTL.ZS",          # chômage (%)
  gdp_growth   = "NY.GDP.PCAP.KD.ZG"        # croissance PIB/hab
)

# 3) Télécharger les données (2020–2023)
wb_data_all <- wb_data(
  country     = iso3_list,
  indicator   = indicators,
  start_date  = start_period,
  end_date    = end_period - 1,
  return_wide = FALSE
)

# 4) Moyenne par pays
wb_summary <- wb_data_all %>%
  group_by(iso3c, indicator) %>%
  summarise(value_avg = mean(value, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = indicator, values_from = value_avg) %>%
  rename(GDP =  `GDP per capita, PPP (current international $)`,
         pop = `Population, total`,
         unemp = `Unemployment, total (% of total labor force) (modeled ILO estimate)`) %>%
  select(iso3c, GDP, pop, unemp)

# -- Estimations manuelles (valeurs « raisonnables » 2020–2024, Intl$)
# Remplace/actualise facilement ces nombres si tu as de meilleures sources.
manual_estimates <- c(
  "LIE" = 139100,
  "MCO" = 115700,
  "SSD" = 1700,  # Sud-Soudan (conflit/famine -> très bas)
  "ERI" = 1900,  # Erythrée
  "YEM" = 2000,  # Yémen
  "PRK" = 1500,  # Corée du Nord (ordre de grandeur)
  "SOM" = 1450,  # Somalie
  "CUB" = 21000, # Cuba (ordre de grandeur PPP)
  "VEN" = 8400   # Venezuela (post-crise, PPP par hab.)
)

wb_summary[sapply(names(manual_estimates), function(x) grep(x, wb_summary$iso3c)), "GDP"] <- manual_estimates

manual_df <- tibble(
  iso3c = names(manual_estimates),
  GDP = as.numeric(manual_estimates),
  n_years = 0L
)

# -- Mapping territoires -> économie « parente » (imputation par copie)
parent_map <- c(
  "ASM" = "USA", # American Samoa -> USA
  "GIB" = "GBR", # Gibraltar -> Royaume-Uni
  "GUM" = "USA", # Guam -> USA
  "IMN" = "GBR", # Isle of Man -> Royaume-Uni (proche)
  "NCL" = "FRA", # Nouvelle-Calédonie -> France
  "MNP" = "USA", # Northern Mariana Islands -> USA
  "PYF" = "FRA", # Polynésie française -> France
  "VGB" = "GBR"  # British Virgin Islands -> Royaume-Uni
)

parent_df <- tibble(iso3c = names(parent_map),
                    parent_iso3 = unname(parent_map)) %>%
  left_join(wb_summary %>% select(parent_iso3 = iso3c,
                                parent_ppa = GDP),
            by = "parent_iso3") %>%
  transmute(iso3c,
            GDP = parent_ppa,
            n_years = -1L)  # code -1 pour signaler une imputation « parente »
wb_summary[sapply(parent_df$iso3c, function(x) grep(x, wb_summary$iso3c)), "GDP"] <- parent_df$GDP

# estimation du chomage 
manual_estimates <- c(
  "ABW" = 3.9,
  "AND" = 1.5,
  "ASM" = 5,
  "ATG" = 8,
  "BMU" = 4,
  "CUW" = 7,
  "CYM" = 3,
  "DMA" = 8,  
  "FRO" = 2,
  "FSM" = 11,
  "GIB" = 3,
  "GRD" = 21,
  "GRL" = 5,
  "IMN" = 0.7,
  "KIR" = 5,
  "KNA" = 25, 
  "LIE" = 1,
  "MCO" = 2,
  "MHL" = 6,
  "MNP" = 10,
  "NRU" = 10,
  "PLW" = 5,
  "SMR" = 4,
  "SXM" = 8,
  "SYC" = 6,
  "TCA" = 11,
  "TUV" = 1,
  "VGB" = 6.2)
wb_summary[sapply(names(manual_estimates), function(x) grep(x, wb_summary$iso3c)), "unemp"] <- manual_estimates

# j'ajoute les manquants 
aux_estimates <- tribble(
  ~iso3c, ~pop, ~GDP, ~unemp, 
  "AIA",  16e3, 22000,  5,  
  "COK",  17e3, 22000,  8,  
  "FLK", 3.6e3, 55000,  4,  
  "GUF", 300e3, 27000, 20,  
  "GLP", 380e3, 27000, 19,  
  "MTQ", 360e3, 27000, 17,  
  "MYT", 320e3, 10000, 25,  
  "MSR",   5e3, 13000,  6, 
  "NIU",  1.6e3, 17000, 12, 
  "REU", 870e3, 27000, 21,  
  "SHN",  5.6e3, 15000, 14, 
  "SPM",    6e3, 39000, 11, 
  "TKL",  1.5e3,  6000, 20, 
  "WLF",   11e3,  9000, 12, 
  "ESH", 600e3,  3000, 25, 
  "TWN", 23300000, 33000, 3.8
) %>%
  mutate(
    # Optionnel : arrondis pour éviter de fausses précisions
    pop = round(pop),
    GDP = round(GDP, 0),
    unemp = round(unemp, 1)
  )
wb_summary <- rbind(wb_summary, aux_estimates)
save(wb_summary, file = "data/GDP.RData")
load("data/GDP.RData")
wb_summary$LGDP <- log(wb_summary$GDP)
wb_summary$Lpop <- log(wb_summary$pop)
wb_summary$Lunemp <- log(wb_summary$unemp)
my_X <- merge(my_X, wb_summary, by.x = "iso3", by.y = "iso3c") 

1.7 Imputation of missing values

A few observations were missing for the HDI, political stability, and natural disaster variables.
To address this issue, we applied the Random Forest–based imputation algorithm implemented in the missForest package (Stekhoven & Bühlmann, 2012).
This method iteratively predicts missing entries using ensembles of decision trees and is well suited for mixed data types (continuous and categorical).

Compared to simpler approaches such as mean substitution or linear interpolation, random forest imputation preserves the multivariate structure of the data and reduces the risk of bias. This is particularly important in our setting, as the missingness is not systematic and the relationships between explanatory variables may be nonlinear.

numeric_vars <- sapply(my_X, is.numeric)
vars_with_missing <- sapply(my_X, function(x) any(is.na(x)))
# fill missing values
temp_imp <- missForest::missForest(my_X[, numeric_vars])$ximp
my_X[, names(which(vars_with_missing))] <- temp_imp[, names(which(vars_with_missing))]

1.8 Spatially lagged variables

We include spatially lagged variables to account for potential spillover effects across countries. To this end, we first construct a spatial weights matrix based on the 5 nearest neighbors of each country. In addition, we build an alternative matrix using the 10 nearest neighbors, which serves as a robustness check in the empirical analysis.

To construct these neighbors, we rely on the geographical database developed by James (2025), which provides harmonized measures of distances and contiguity for all recognized countries and territories. This dataset ensures consistency with international standards and is widely used in empirical studies of international trade and migration.

my_proj <- '+proj=robin +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs'
source(file = "data/Load_Geospatial_Data.R")
sf_use_s2(F)
## Spherical geometry (s2) switched off
world_sf <- GeoDATA %>%
  rename(region = SIREGION, iso3 = ISO3)
countries_regions <- world_sf %>%
  rmapshaper::ms_simplify(, keep = 0.05, keep_shapes = T)
# polygons with same country than UN data basis
world <- st_transform(countries_regions, my_proj)
world_union <- st_union(world)

# contours, longitude and latitude of the map
p1 <- rbind(c(-180, -90), 
            c(180, -90),
            cbind(180, seq(-90, 90, by = 1)),
            c(180, 90),
            c(-180, 90),
            cbind(-180, seq(90, -90, by = -1)),
            c(-180, -90))
border_sf <- st_sfc(st_polygon(list(p1)))
long_sf <- st_sfc(st_linestring(cbind(-150, seq(-90, 90, by = 1))),
                  st_linestring(cbind(-120, seq(-90, 90, by = 1))),
                  st_linestring(cbind(-90, seq(-90, 90, by = 1))),
                  st_linestring(cbind(-60, seq(-90, 90, by = 1))),
                  st_linestring(cbind(-30, seq(-90, 90, by = 1))),
                  st_linestring(cbind(0, seq(-90, 90, by = 1))),
                  st_linestring(cbind(150, seq(-90, 90, by = 1))),
                  st_linestring(cbind(120, seq(-90, 90, by = 1))),
                  st_linestring(cbind(90, seq(-90, 90, by = 1))),
                  st_linestring(cbind(60, seq(-90, 90, by = 1))),
                  st_linestring(cbind(30, seq(-90, 90, by = 1)))
)
lat_sf <- st_sfc(st_linestring(cbind(seq(-180, 180, by = 1), -60)),
                  st_linestring(cbind(seq(-180, 180, by = 1), -30)),
                  st_linestring(cbind(seq(-180, 180, by = 1), -0)),
                  st_linestring(cbind(seq(-180, 180, by = 1), 30)),
                  st_linestring(cbind(seq(-180, 180, by = 1), 60))
)
# Coordinate Reference System
st_crs(border_sf) <- 4326
st_crs(long_sf) <- 4326
st_crs(lat_sf) <- 4326
border_sf <- st_transform(border_sf, my_proj)
long_sf <- st_transform(long_sf, my_proj)
lat_sf <- st_transform(lat_sf, my_proj)
sea <- st_difference(border_sf, world_union)

# create area
world_iso <- world[world$iso3 %in% unique(pairs_od$cont_o), ]
world_iso$SMALL_AREA <- ifelse(world_iso$AREA < 10000, "yes", "no")

Figure 3.6 in the chapter illustrates the spatial network linking the 230 countries and territories in our dataset, with each unit connected to its designated set of neighbors.

world_iso_3 <- merge(world_iso, my_X, by = "iso3")
world_iso_3_agg <- world_iso_3
world_iso_3_agg$iso3[world_iso_3_agg$iso3 == "ESH"] <- "MAR"
world_iso_3_agg <- aggregate(world_iso_3_agg[, "iso3"], 
          by = list(iso3 = world_iso_3_agg$iso3),
          FUN = length)
# fontiere Maroc / Sahara 
frontiers <- st_intersection(world_iso_3[world_iso_3$iso3 == "MAR", ], 
                world_iso_3[world_iso_3$iso3 == "ESH", ])
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
my_centroid <- st_as_sf(data.frame(
  long = world_iso_3$Centroid_of_LARGEST_POLYGON_X,
  lat = world_iso_3$Centroid_of_LARGEST_POLYGON_Y,
  iso3 = world_iso_3$iso3),
  coords = c("long", "lat"),
  crs = 4326)

# based on contiguity + 3 neighbours
nb_out <- union.nb(poly2nb(world_iso_3),
                   knn2nb(knearneigh(my_centroid, k = 3)))
# based on 10 neighbours
out <- knearneigh(my_centroid, k=10)
nb_out_10 <- knn2nb(out)

OW <- nb2mat(nb_out)

crs_robin <- st_crs(world_iso_3)
neighbours_sf <- st_as_sf(nb2lines(nb_out,
            coords =  st_coordinates(st_centroid(world_iso_3))))
st_crs(neighbours_sf) <- crs_robin
long_sf_180 <- st_sfc(st_linestring(cbind(-180, seq(-90, 90, by = 1))),
                  st_linestring(cbind(180, seq(-90, 90, by = 1))),
                  crs = 4326
)
long_sf_180 <- st_transform(long_sf_180, crs_robin)

ind_big <- which(as.numeric(st_length(neighbours_sf)) > 10^7)

#pdf("figures/spatial_weight.pdf", width = 10, height =5)
par(oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0))
plot(st_geometry(world_iso_3_agg), border = "lightgrey",
     ylim = c(-7*10^6, 7*10^6))
plot(st_geometry(frontiers), add = T, col = "lightgrey",
     lty = 3)
plot(st_geometry(sea), col = "lightblue", add = T)
plot(st_geometry(neighbours_sf[-ind_big, ]), col = "darkred",
     add =T, lwd = 0.8)

for(l in 1:length(ind_big)) {
   
# point 1
x_pt_1 <- neighbours_sf[ind_big[l], ]$geometry[[1]][1, 1]
y_pt_1 <- neighbours_sf[ind_big[l], ]$geometry[[1]][1, 2]
pt_1 <- st_sfc(st_point(c(x_pt_1, y_pt_1)), crs = crs_robin)
# point 2
x_pt_2 <- neighbours_sf[ind_big[l], ]$geometry[[1]][2, 1]
y_pt_2 <- neighbours_sf[ind_big[l], ]$geometry[[1]][2, 2]
pt_2 <- st_sfc(st_point(c(x_pt_2, y_pt_2)), crs = crs_robin)

seg1 <- st_nearest_points(pt_1, long_sf_180[which.min(as.numeric(st_distance(pt_1, long_sf_180))), ])  
seg2 <- st_nearest_points(pt_2, long_sf_180[which.min(as.numeric(st_distance(pt_2, long_sf_180))), ])  
plot(st_geometry(seg1), add = T, col = "darkred")
plot(st_geometry(seg2), add = T, col = "darkred")
}

#dev.off()

We construct spatially lagged versions of the \(OX/DX\) variables, capturing the influence of neighboring countries’ characteristics.

my_X <- st_drop_geometry(world_iso_3)

climate_var <- c("t2m_diff",  "prec_diff", "ghwr_35", "P5D",  
                 "dry", "wet",  "wsdi", "wsd",  "wsei",  "csdi",  
                 "csd", "csei", "hwf",  "hwd", "hwei", "cwf", "cwd", 
                 "cwei", "nat_dis", "Lnat_dis")
other_var <- c("politicalstability", "HDI", "LGDP", "Lpop", "Lunemp")

for(k in c(climate_var, other_var)) {
  my_X[, paste0("Lag_", k)] <- lag.listw(nb2listw(nb_out), my_X[, k])
}
save(my_X, file = "data/covariates.RData")

1.8.1 Linguistic and colonial attributes

To characterize bilateral migration flows, we use dyadic variables from CEPII datasets.
The GeoDist database provides information on languages and colonial ties (Mayer & Zignago, 2011).
The Gravity dataset complements this by including a broader set of bilateral covariates, such as cultural, colonial, linguistic, and geographical links (Head, Mayer & Ries, 2022).

These attributes help capture historical and cultural relationships between origin and destination countries.

geo <- readxl::read_xls("data/geo_cepii.xls",
                        sheet = "geo_cepii", skip = 0)
cepii2 <- read.csv("data/geo_cepii_gapfilled_subset.csv")
names(cepii2)[c(3, 13)] <- c("cnum", "lon")
geo <- rbind(geo, cepii2)

names_lang <- c("langoff_1", "langoff_2", "langoff_3", "lang20_1", "lang20_2", "lang20_3", "lang20_4", "lang9_1", "lang9_2", "lang9_3", "lang9_4")
names_colony <- c("colonizer1", "colonizer2", "colonizer3", "colonizer4", "short_colonizer1",
                  "short_colonizer2", "short_colonizer3")
geo_df <- data.frame(unique(geo[, c("iso3", names_lang, names_colony)]), row.names = "iso3")
pairs_od[, "comlang"] <- 0
pairs_od[, "colony"] <- 0
for(l in 1:nrow(pairs_od)) {
  pairs_od[l, "comlang"] <- length(setdiff(intersect(
    as.character(geo_df[pairs_od$cont_o[l], names_lang]),
    as.character(geo_df[pairs_od$cont_d[l], names_lang])), "."))
  pairs_od[l, "colony"] <- length(intersect(
    as.character(row.names(geo_df[pairs_od$cont_o[l], ])),
    as.character(geo_df[pairs_od$cont_d[l], names_colony])))
}
pairs_od[, "comlang"] <- factor(ifelse(pairs_od[, "comlang"] > 0, "yes", "no"),
                                 levels = c("yes", "no"))
pairs_od[, "colony"] <- factor(ifelse(pairs_od[, "colony"] > 0, "yes", "no"),
                                 levels = c("yes", "no"))

1.8.2 Geographical proximity: distances and contiguity

We include two measures of geographical proximity between countries:

  • Minimum distance: computed with the st_distance() function from the sf package, as the shortest distance between national borders. For neighbors, this distance is zero, which better reflects spatial separation than capital-to-capital distances.

  • Contiguity: a binary variable equal to 1 if two countries share a border, 0 otherwise.

Both measures are derived from the harmonized geographical database of James (2025), widely used in trade and migration studies.

# create distance
n <- nrow(world_iso_3)
n_d <- n_o <- n  
names_o <- names_d <- world_iso_3$iso3
g_2 <- st_distance(world_iso_3)
g_2 <- st_distance(st_centroid(world_iso_3, of_largest_polygon = T))
dimnames(g_2) <- list(names_o, names_d)
g_2 <- as.vector(t(g_2)) / 1000
# g_2 <- ifelse(g_2 != 0, log(g_2), 0)  
names(g_2) <- paste0(rep(names_o, n_d), "-", rep(names_d, each = n_o))
pairs_od$g <- 0
pairs_od[names(g_2), "g"] <- g_2
# plot(log(1 + y) ~ log(1 + g), data = pairs_od)
# abline(lm(log(1 + y) ~ log(1 + g), data = pairs_od), col = "red")

## contiguity variable
ctg <- nb2mat(poly2nb(world_iso_3), zero.policy = TRUE, style = "B") 
dimnames(ctg) <- list(
  world_iso_3$iso3, 
  world_iso_3$iso3)
pairs_od$contig <- 0
for(l in 1:nrow(pairs_od)) {
  pairs_od[l, "contig"] <- ctg[pairs_od[l, "cont_o"], pairs_od[l, "cont_d"]]
}
pairs_od$contig <- ifelse(pairs_od$contig > 0, "yes", "no")

1.9 Merging datasets

We then construct the final dataset by combining all relevant dimensions.
It brings together:

  • Origin-specific variables (population, GDP, political stability, climate indicators),
  • Destination-specific variables (same indicators at the receiving country), and
  • Pairwise variables (geographical distance, contiguity, common language, colonial links).

The resulting dataset integrates all covariates needed for the empirical analysis of bilateral migration flows.

my_X <- data.frame(my_X, row.names = "iso3") 

for(k in colnames(my_X)) {
  pairs_od[, paste0(k, "_O")] <- my_X[pairs_od$cont_o, k]
  pairs_od[, paste0(k, "_D")] <- my_X[pairs_od$cont_d, k]
}

pairs_od$IncomeLevel_O <-  factor(pairs_od$IncomeLevel_O, 
              levels = c("LOW", "LOWER-MIDDLE", "UPPER-MIDDLE", "HIGH"))
pairs_od$IncomeLevel_D <- factor(pairs_od$IncomeLevel_D, 
              levels = c("LOW", "LOWER-MIDDLE", "UPPER-MIDDLE", "HIGH"))
pairs_od$class_conflict_O <- factor(pairs_od$class_conflict_O, 
              levels = c("low", "medium", "high", "very high"))
pairs_od$class_conflict_D <- factor(pairs_od$class_conflict_D, 
              levels = c("low", "medium", "high", "very high"))
pairs_od$comlang <- factor(pairs_od$comlang, levels = c("no", "yes"))
pairs_od$colony <- factor(pairs_od$colony, levels = c("no", "yes"))
pairs_od$contig <- factor(pairs_od$contig, levels = c("no", "yes"))
pairs_od$SMALL_AREA_O <- factor(pairs_od$SMALL_AREA_O, levels = c("no", "yes"))
pairs_od$SMALL_AREA_D <- factor(pairs_od$SMALL_AREA_O, levels = c("no", "yes"))

1.10 Summary statistics

This section provides the R code used to produce the supplementary material presented in Annex C:

Table C.1:

pairs_od_without_intra <- pairs_od  %>%
  filter(y > 0) #cont_o != cont_d) #y > 0) # %>%   filter(CONTINENT_O == "ASIA")

sapply(my_X[, c("hwf", "dry", "politicalstability", 
             "HDI", "Lpop")], function(x) 
      paste0(round(mean(x), 2), " & ", round(sd(x), 2), " & ", 
              round(min(x), 2), " & ",  round(max(x), 2)))
##                             hwf                             dry 
##    "39.37 & 35.05 & 0 & 175.37" "50.91 & 47.33 & 5.98 & 270.87" 
##              politicalstability                             HDI 
##    "0.06 & 0.99 & -2.78 & 1.66"     "0.76 & 0.15 & 0.39 & 0.98" 
##                            Lpop 
##   "14.98 & 2.72 & 7.31 & 21.07"
sapply(pairs_od_without_intra[, c("y_1", "g")], function(x) 
      paste0(round(mean(log(1 + x)), 2), " & ", round(sd(log(1 + x)), 2), " & ", 
              round(min(log(1 + x)), 2), " & ",  round(max(log(1 + x)), 2)))
##                          y_1                            g 
##     "2.26 & 2.8 & 0 & 14.42" "8.77 & 0.85 & 3.01 & 10.41"
sapply(pairs_od_without_intra[, c("comlang", "colony", "contig")], table)
##     comlang colony contig
## no    34936  44001  43626
## yes    9328    263    638
table(my_X$class_conflict)
## 
##      high       low    medium very high 
##        22       150        43        15

Figures C.1–C.3.:

### OX/DX variables 
names_xvar <- c("politicalstability", "HDI", "Lpop", "class_conflict")
names_title <- c("Political stability index", "Human Development Index",
                 "Log of population", "Conflict")
# at origin 
p_list <- vector("list", 4)
for(k in 1:length(names_xvar)) {
  if(k != 4)
    p_list[[k]] <- ggplot(data = pairs_od_without_intra) +
      aes(x = !!sym(paste0(names_xvar[k], "_O")), y = log(1 + y)) +
      geom_point(size = 0.5) +
      geom_smooth(method = "lm") +
      xlab(paste0(names_title[k], " (at origin)"))
  else
    p_list[[k]] <- ggplot(data = pairs_od_without_intra) +
      aes(x = !!sym(paste0(names_xvar[k], "_O")), y = log(1 + y)) +
      geom_boxplot(size = 0.5)  +
      xlab(paste0(names_title[k], " (at origin)"))
}  

p <- gridExtra::grid.arrange(p_list[[1]], p_list[[2]], 
                             p_list[[3]], p_list[[4]], ncol = 4)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

p
## TableGrob (1 x 4) "arrange": 4 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
## 4 4 (1-1,4-4) arrange gtable[layout]
#ggsave(p, filename = paste0("figures/OD_scatter_O.pdf"), 
#                            width = 16, height = 4)


# at destination 
p_list <- vector("list", 4)
for(k in 1:length(names_xvar)) {
  if(k != 4)
    p_list[[k]] <- ggplot(data = pairs_od_without_intra) +
      aes(x = !!sym(paste0(names_xvar[k], "_D")), y = log(1 + y)) +
      geom_point(size = 0.5) +
      geom_smooth(method = "lm") +
      xlab(paste0(names_title[k], " (at destination)"))
  else
    p_list[[k]] <- ggplot(data = pairs_od_without_intra) +
      aes(x = !!sym(paste0(names_xvar[k], "_D")), y = log(1 + y)) +
      geom_boxplot(size = 0.5)  +
      xlab(paste0(names_title[k], " (at destination)"))
}  

p <- gridExtra::grid.arrange(p_list[[1]], p_list[[2]], p_list[[3]], p_list[[4]], ncol = 4)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

p
## TableGrob (1 x 4) "arrange": 4 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
## 4 4 (1-1,4-4) arrange gtable[layout]
#ggsave(p, filename = paste0("figures/OD_scatter_D.pdf"), 
#                            width = 16, height = 4)

#############
### OX/DX variables 
p4 <- ggplot(data = pairs_od_without_intra) +
  aes(x = log(1+y_1), y = log(1 + y)) +
  geom_point(size = 0.5)  +
  geom_smooth(method = "lm") +
  xlab("Log(1 + past flows)")

p5 <- ggplot(data = pairs_od_without_intra) +
  aes(x = log(1 + g), y = log(1 + y)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "lm") +
  xlab("Log(1 + distance)")

p1 <- ggplot(data = pairs_od_without_intra) +
  aes(x = colony, y = log(1 + y)) +
  geom_boxplot(size = 0.5)  +
  xlab("Shared colonial history")

p2 <- ggplot(data = pairs_od_without_intra) +
  aes(x = contig, y = log(1 + y)) +
  geom_boxplot(size = 0.5)  +
  xlab("Contiguity")

p3 <- ggplot(data = pairs_od_without_intra) +
  aes(x = comlang, y = log(1 + y)) +
  geom_boxplot(size = 0.5)  +
  xlab("Common language")

p <- gridExtra::grid.arrange(p1, p2, p3, p4, p5, ncol = 3)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

ggsave(p, filename = paste0("figures/pairs.pdf"), 
                            width = 12, height = 8)

# climate
names_xvar <- c("hwf", "dry")
names_title <- c("Heat wave frequency",
                 "Consecutive dry days")
for(k in 1:length(names_xvar)) {
p1 <- ggplot(data = pairs_od_without_intra) +
  aes(x = !!sym(paste0(names_xvar[k], "_O")), y = log(1 + y)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "lm") +
  xlab(paste0(names_title[k], " (at origin)"))
p2 <- ggplot(data = pairs_od_without_intra) +
  aes(x = !!sym(paste0("Lag_", names_xvar[k], "_O")), y = log(1 + y)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "lm") +
  xlab(paste0(names_title[k], " (lagged at origin)"))
p3 <- ggplot(data = pairs_od_without_intra) +
  aes(x = !!sym(paste0(names_xvar[k], "_D")), y = log(1 + y)) +
  geom_point(size = 0.5)  +
  geom_smooth(method = "lm") +
  xlab(paste0(names_title[k], " (at destination)"))
p4 <- ggplot(data = pairs_od_without_intra) +
  aes(x = !!sym(paste0("Lag_", names_xvar[k], "_D")), y = log(1 + y)) +
  geom_point(size = 0.5)  +
  geom_smooth(method = "lm") +
  xlab(paste0(names_title[k], " (lagged at destination)"))
p <- gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 4)
# ggsave(p, filename = paste0("figures/", names_xvar[k], ".pdf"), 
#                            width = 20, height = 5)
p
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

We plot the correlation plot presented in Figure C.4.

X <- pairs_od_without_intra %>% 
  mutate(log_y = log(1+y),
         log_distance = log(1+g),
         log_past_y = log(1+y_1)) %>%
  rename(political_O = politicalstability_O,
         political_D = politicalstability_D,
         log_pop_O = Lpop_O,
         log_pop_D = Lpop_D
         )

# --- 2) Define families and desired order -----------------------------------
fam_climate  <- c("hwf_O","hwf_D","dry_O","dry_D")
fam_inst     <- c("political_O","political_D","HDI_O","HDI_D")
fam_demo     <- c("log_pop_O","log_pop_D", "log_past_y")
fam_dyadic   <- c("log_distance")

# Put Y first so its row/col are easy to see
var_order <- c("log_y", fam_climate, fam_inst, fam_demo, fam_dyadic)

# Keep only available columns, in that order
X <- X[, var_order]

# --- 3) Correlation matrix ---------------------------------------------------
# Pearson by default; use method = "spearman" if you prefer rank correlations.
R <- cor(X, use = "pairwise.complete.obs", method = "pearson")

# --- 4) Plot with corrplot (ellipses, diverging colors) ----------------------
# Color palette
pal <- colorRampPalette(c("#2166AC","#67A9CF","#D1E5F0","#F7F7F7",
                          "#FDDAD1","#EF8A62","#B2182B"))

# Make Y label stand out (black), others grey
tl_cols <- c("black", rep("#1B9E77", 4),
              rep("#D95F02", 4),
              rep("#7570B3", 3),
              rep("#E7298A", 1))

# Open plotting device
# pdf("figures/corr.pdf", width = 7, height =6)
op <- par(no.readonly = TRUE); on.exit(par(op))
corrplot(R,
  method = "ellipse",
  type = "upper",
  diag = FALSE,
  col = pal(200),
  tl.col = tl_cols,
  tl.srt = 45,
  tl.cex = 0.8,
  addgrid.col = "grey90",
  mar = c(0,0,.2,0),
  number.cex = 0.7,
  # uncomment next line to print coefficients on the ellipses
  # addCoef.col = "black",
  cl.ratio = 0.2, cl.align.text = "l"
)


# --- 5) Emphasize Y's row/column --------------------------------------------
p <- ncol(R)
iy <- match("log_y", colnames(R))
# Outline the Y row/column
rect(xleft = 1.5, ybottom = p - iy + 1.5, xright = p + 0.5, ytop = p - iy + 0.5,
       border = "black", lwd = 2)
segments(x0 = 5.5, x1 = p + 0.5, y0 = 8.5, y1 = 8.5, lwd = 2, lty = 2)
segments(x0 = 9.5, x1 = p + 0.5, y0 = 4.5, y1 = 4.5, lwd = 2, lty = 2)

segments(x0 = 5.5, x1 = 5.5, y0 = 12.5, y1 = 13.5, lwd = 2, lty = 2)
segments(x0 = 9.5, x1 = 9.5, y0 = 12.5, y1 = 13.5, lwd = 2, lty = 2)
segments(x0 = 12.5, x1 = 12.5, y0 = 12.5, y1 = 13.5, lwd = 2, lty = 2)

#dev.off()

1.11 Visualization

Here we display the covariates on the same maps used for the migration flows, allowing a direct visual comparison between explanatory factors and observed movements.

source("codes/plot_flows_unique.R")
title_var <- c("Difference from \n average T2M temp. (in degree C)", 
               "Difference from \n average precipitation (in mm)",
               "GWHR",
               "Maximum consecutive \n 5-days precipitation (in mm)",
               "Total Count of Dry Days", 
               "Total Count of Wet Days",
               "Annual count of days \n satisfying the warm spell risk",
               "Length of the longest \n warm spell risk",
               "Daily maximum \n temperature excess (upp) risk",
               "Annual count of days \n satisfying the heat wave risk",
               "Length of the longest \n heat wave risk",
               "Daily mean temperature \n excess (upp) risk", 
               "Annual count of days \n satisfying the cold spell risk",
               "Length of the longest \n cold spell risk",
               "Daily maximum temperature \n excess (low) risk",
               "Annual count of days \n satisfying the warm wave risk",
               "Length of the longest \n warm wave risk",
               "Daily mean \n temperature excess (low) risk")

1.12 By indicator

world_3035 <- st_transform(world, 3035) 
sea_3035 <- st_transform(sea, 3035)
world_union_3035 <- st_transform(world_union, 3035)
year_k <- "2020-2024"
temp_k <- all_estimates %>% 
    filter(type == "tot", year == year_k)
# we keep optimal transport estimate 
mig_flow <- temp_k[, c("origin", "dest", "hat_transport_open")]  %>%
    filter(hat_transport_open > 2500) 
names(mig_flow)[3] <- "y"
y <- mig_flow$y
index_o <- mig_flow$origin
index_d <- mig_flow$dest

##
world_crs <- st_transform(world_iso_3, 3035)
world_iso_3_agg_3035 <- st_transform(world_iso_3_agg, 3035)
frontiers_3035 <- st_transform(frontiers, 3035)
countries_ct <- unique(c(index_o, index_d))
nom_vars <- c("hwf", "dry", "politicalstability", "HDI", "pop", "class_conflict", "nat_dis")
title_var <- c("Annual count of days \n satisfying the heat wave risk \n ",
               "Consecutive dry days",
               "Political stability",
               "HDI index",
               "Population",
               "Conflict",
               "Natural disaster"
               )
for(l in 1:length(nom_vars)) {
  
  # vec_temp <- st_drop_geometry(final_indicator)[, nom_vars[l]]
  vec_temp <- st_drop_geometry(world_crs)[, nom_vars[l]]
  if(nom_vars[l] %in% c("politicalstability", "t2m_diff", 
                        "prec_diff")) {
    my_mid <- ifelse(nom_vars[l] == "HDI", 0.5, 0)
    bk_1 <- round(classInt::classIntervals(vec_temp[vec_temp < my_mid], 4,
                                           "kmeans")$brks, 
                  digits = 6)
    bk_2 <- round(classInt::classIntervals(vec_temp[vec_temp >= my_mid], 4,
                                           "kmeans")$brks, 
                  digits = 6)
    bk <- c(bk_1[1:4], my_mid, bk_2[2:5]) 
    bk[1] <- min(vec_temp)
    bk[length(bk)] <- max(vec_temp)
    pal1 <- rev(RColorBrewer::brewer.pal(8, "RdBu"))   
    if(nom_vars[l] == "politicalstability")
      pal1 <- rev(pal1)  
    } else { 
      if(class(vec_temp) == "numeric") {
        bk <- round(classInt::classIntervals(vec_temp, 8, "kmeans")$brks, 
            digits = 6)
        bk[1] <- min(vec_temp)
        bk[length(bk)] <- max(vec_temp)
        pal1 <- RColorBrewer::brewer.pal(8, ifelse(nom_vars[l] %in% 
                            c("wet", "P5D"), "Blues", "Reds"))
      }
    }
  if(nom_vars[l] == "HDI")
      pal1 <- RColorBrewer::brewer.pal(8, "Spectral")
  
  if(nom_vars[l] == "class_conflict") {
      bk <- 1:5
      pal1 <- RColorBrewer::brewer.pal(4, "Reds")
      ind <- as.numeric(factor(vec_temp, 
        levels = c("low", "medium", "high", "very high")))
      
  } else {
  ind <- findInterval(vec_temp, bk, all.inside = TRUE)
  }
#  pdf(paste0("figures/world_", nom_vars[l], ".pdf"), width = 12, height = 7.7)  # width = 12, height = 6.7)  
      par(mar = c(0, 0, 1, 0))
      plot(st_geometry(world_crs), col = pal1[ind], 
         border = pal1[ind], pch = 15, cex = 0.5, lwd = 0.1,
         ylim = c(-0.6 * 10^7, 10^7))
      plot(st_geometry(world_iso_3_agg_3035), border = "lightgrey",  
           lwd = 0.1, add = T)
      plot(st_geometry(frontiers_3035), add = T, col = "lightgrey", 
           lwd = 0.3, lty = 2)

      # title("Consecutive Dry Day in 2024", line = -1.15)
      #plot(sea_3035, col = scales::alpha("lightblue", 0.3), 
      #     border = rgb(0.2, 0.2, 0.2), add = T)
      plot(st_geometry(world_union_3035), add = T, border = rgb(0.2, 0.2, 0.2))
      #plot(long_sf, add = T, col = "lightgrey")
      #plot(lat_sf, add = T, col = "lightgrey")

## parameters for legend
    poly_leg_temp <- vector("list", length(bk) - 1)
    for(k in 1:(length(bk) - 1)) {
      poly_leg_temp[[k]] <- rbind(c(14000000, -6300000 + (k - 1) * 750000), 
         c(14000000 + 1000000, -6300000 + (k - 1) * 750000),
         c(14000000 + 1000000, -6000000 + (k - 1) * 750000),
         c(14000000, -6000000 + (k - 1) * 750000),
         c(14000000, -6300000 + (k - 1) * 750000))
      if(nom_vars[l] == "class_conflict")
        poly_leg_temp[[k]][, 2] <- poly_leg_temp[[k]][, 2] + 2800000
    }
    poly_leg <- st_sfc(st_polygon(poly_leg_temp[1]))
    for(k in 2:length(poly_leg_temp)) {
      poly_leg <- c(poly_leg, st_sfc(st_polygon(poly_leg_temp[k])))
    }
    # legend
    plot(poly_leg, col = pal1, add = T)
    temp_coord <- st_coordinates(st_centroid(poly_leg))
    if(nom_vars[l] != "class_conflict") {
      if(nom_vars[l] %in% c("pop", "nat_dis"))
        bk_round <- paste0(round(bk/1000000), "M")
      else
        bk_round <- round(bk, 2)
      for(k in 1:length(poly_leg)) {
        if(k == 1) {
          my_text <- paste0("<", bk_round[2])
          } else {
            if(k == (length(bk) - 1)) {
              my_text <- paste0(">=", bk_round[length(bk) - 1])
              } else {
                my_text <- paste0("[", bk_round[k], "; ", bk_round[k+1], "[")
              }
          }
        text(temp_coord[k, 1], temp_coord[k, 2] + 10, my_text, 
             pos = 3, cex = 0.7)
      }
    } else {
      my_text <- c("low", "medium", "high", "very high")
      text(temp_coord[, 1], temp_coord[, 2] + 10, my_text, pos = 3, cex = 0.7) 
    }
    text(1.5 * 10^7, -0.01 * 10^7, title_var[l], cex = 0.8)

# plot(st_geometry(world_union), border = rgb(0.4, 0.4, 0.4), add = T)
plot_flows_unique(
    y, # value of the flow
    index_o = as.character(index_o), # label of the origin
    index_d = as.character(index_d),  # label of the destination
    geo_site = world_crs,  # the geographic coordinates of the sites: polygon or points of class sf
    column_name = "iso3", # the column name of the regions in geo_site sf object and/or in xy_sf
    add = T,   # if add = T, the arcs and only the arcs will printed on the existing device
    type = "barchart", # if T plot barplot for outloflows/inflows and intra flows
    xy_sf = st_centroid(world_crs, of_largest_polygon=TRUE), # possibility to give different coordinates of the starting and ending flows (by default, it corresponds to the centroid of geo_site)
    select_o = NULL, # the origin sites to print
    select_d = NULL, # the destination site to print
    select_bar = countries_ct, # the sites to print the barplot
    select_flows =NULL, # a vector of logical of size N indicating the flows to be printed
    col_site = NULL, # a vector of colors with attribute names equal to the name of the sites
    transparency = list(alpha = c(0.05, 0.6, 1), 
                      range = c(0, 100000, max(y))), # two scalars to define the values of the minimum and maximum transparency
    width_arc = c(0.2, max(y)), # a flow equals to max(flows) is represented by an arc of width 1 
    size_head_arc = 1, # the size of the ending and starting of the arrows
    reduce_arc = 0.9, # the percentage of reduction of the flow
    percent_arc = 0.1, # the percentage of flows to represent
    width_bar = 0.7, # the width of the barplot
    lwd_bar = 0.1,   # the lwd of the bar
    percent_bar = 1, # the percentage of barplot to print
    col_direction = "none", # a character among "outflows", "inflows", "none"; by default, the outflows will be plotted in color
    col_geometry = "lightgrey",  # color inside the polygons
    border_geometry = "white", # color of the contours 
    lwd_geometry = 0.5, # width of the contours,
    print_names = F, # print the names of the sites on the map
    size_names = 1, # size of the names if printed
    print_legend = T, # legend for the width of the flows
    pos_legend = "bottomright", # position of the legend
    values_legend = c(100000, 200000, 500000), # values of the flows to be printed on the legend
    size_legend = 0.7, # size of the characters in the legend
    horiz_legend = F, # by default vertical
    title_legend = "Flows", # the name of the legend
    digit_legend = 2, # number of digits to keep in legend
    bezier = F
    )
        
 #dev.off()
}
## Warning: st_centroid assumes attributes are constant over geometries

## Warning: st_centroid assumes attributes are constant over geometries

## Warning: st_centroid assumes attributes are constant over geometries

## Warning: st_centroid assumes attributes are constant over geometries

## Warning: st_centroid assumes attributes are constant over geometries

## Warning: st_centroid assumes attributes are constant over geometries

## Warning: st_centroid assumes attributes are constant over geometries

2 Model estimation and impact decomposition

We now illustrate the estimation of both SLX and SDM models.

2.1 SLX

The SLX (Spatial Lag of X) model is estimated using a standard ordinary least squares (OLS) regression.
Spatial dependence is incorporated by explicitly adding spatially lagged explanatory variables among the covariates.
In practice, this can be implemented with the lm() function in R, once the lagged variables have been constructed.
This approach allows us to capture the influence of neighboring conditions without modeling feedback effects through the dependent variable.

vars_included <- c("hwf", "dry")
#vars_included <- "hwf" 

var_controls <-  c("Lpop_O", "Lpop_D", "HDI_O", "HDI_D",  # LGDP_O + LGDP_D +                         "Lunemp_O", "Lunemp_D", 
                   "politicalstability_O", "politicalstability_D",
                   "class_conflict_O", "class_conflict_D", 
                   "log(1 + y_1)", "log(1 + g)", "contig", "comlang", "colony") 
var_controls_2 <-  c("Lpop_O", "Lpop_D", "HDI_O", "HDI_D",  # LGDP_O + LGDP_D +                         "Lunemp_O", "Lunemp_D", 
                   "politicalstability_O", "politicalstability_D",
                   "class_conflict_O", "class_conflict_D", 
                   "log(1 + g)", "contig", "comlang", "colony") 

var_climate <- paste0(c("", "", "Lag_", "Lag_"),
                      rep(vars_included, each = 4),
                      c("_O", "_D", "_O", "_D"))

form_str <- paste("log(1 + y) ~",
                  paste(c(var_climate, var_controls), collapse = " + "))
form_str_2 <- paste("log(1 + y) ~",
                  paste(c(var_climate, var_controls_2), collapse = " + "))

my_formula <- as.formula(form_str)
my_formula_2 <- as.formula(form_str_2)

Result for specification 1:

spec_1 <- lm(my_formula_2, data = pairs_od_without_intra)
sum_spec_1 <- summary(spec_1)
sum_spec_1
## 
## Call:
## lm(formula = my_formula_2, data = pairs_od_without_intra)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.4159 -1.2268 -0.2385  0.9762  9.0263 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.100e+01  1.604e-01 -68.567  < 2e-16 ***
## hwf_O                     -1.873e-03  4.421e-04  -4.237 2.27e-05 ***
## hwf_D                     -2.218e-03  4.421e-04  -5.016 5.30e-07 ***
## Lag_hwf_O                  4.205e-04  5.225e-04   0.805 0.420943    
## Lag_hwf_D                 -1.227e-03  5.223e-04  -2.350 0.018793 *  
## dry_O                      2.037e-03  3.881e-04   5.248 1.54e-07 ***
## dry_D                      2.861e-03  3.881e-04   7.372 1.71e-13 ***
## Lag_dry_O                 -2.387e-03  4.605e-04  -5.185 2.17e-07 ***
## Lag_dry_D                 -3.085e-03  4.604e-04  -6.701 2.10e-11 ***
## Lpop_O                     4.422e-01  4.665e-03  94.782  < 2e-16 ***
## Lpop_D                     4.482e-01  4.649e-03  96.417  < 2e-16 ***
## HDI_O                      4.736e+00  8.224e-02  57.585  < 2e-16 ***
## HDI_D                      5.202e+00  8.208e-02  63.378  < 2e-16 ***
## politicalstability_O       1.722e-01  1.992e-02   8.644  < 2e-16 ***
## politicalstability_D       2.503e-01  1.992e-02  12.561  < 2e-16 ***
## class_conflict_Omedium    -9.964e-02  2.800e-02  -3.558 0.000374 ***
## class_conflict_Ohigh       4.105e-02  4.075e-02   1.007 0.313819    
## class_conflict_Overy high  8.300e-01  5.423e-02  15.306  < 2e-16 ***
## class_conflict_Dmedium    -1.573e-01  2.799e-02  -5.620 1.92e-08 ***
## class_conflict_Dhigh       6.855e-03  4.074e-02   0.168 0.866400    
## class_conflict_Dvery high  8.439e-01  5.422e-02  15.565  < 2e-16 ***
## log(1 + g)                -9.189e-01  1.102e-02 -83.409  < 2e-16 ***
## contigyes                  3.307e+00  7.628e-02  43.349  < 2e-16 ***
## comlangyes                 1.390e+00  2.255e-02  61.620  < 2e-16 ***
## colonyyes                  2.223e+00  1.132e-01  19.637  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.798 on 44239 degrees of freedom
## Multiple R-squared:  0.5805, Adjusted R-squared:  0.5803 
## F-statistic:  2551 on 24 and 44239 DF,  p-value: < 2.2e-16

Result for specification 2:

spec_2 <- lm(my_formula, data = pairs_od_without_intra)
sum_reg <- summary(spec_2)
sum_reg
## 
## Call:
## lm(formula = my_formula, data = pairs_od_without_intra)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9990 -0.0959  0.0087  0.0894  8.7000 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -3.429e-01  3.532e-02  -9.708  < 2e-16 ***
## hwf_O                     -4.291e-04  9.263e-05  -4.633 3.61e-06 ***
## hwf_D                     -2.235e-04  9.264e-05  -2.412 0.015851 *  
## Lag_hwf_O                 -1.129e-04  1.095e-04  -1.031 0.302545    
## Lag_hwf_D                  4.109e-04  1.094e-04   3.755 0.000174 ***
## dry_O                      1.066e-03  8.131e-05  13.113  < 2e-16 ***
## dry_D                     -6.216e-04  8.138e-05  -7.638 2.24e-14 ***
## Lag_dry_O                 -1.672e-03  9.647e-05 -17.329  < 2e-16 ***
## Lag_dry_D                  6.215e-04  9.652e-05   6.439 1.21e-10 ***
## Lpop_O                     1.182e-02  1.071e-03  11.030  < 2e-16 ***
## Lpop_D                     9.983e-03  1.071e-03   9.318  < 2e-16 ***
## HDI_O                      2.453e-01  1.782e-02  13.759  < 2e-16 ***
## HDI_D                     -1.520e-02  1.800e-02  -0.844 0.398407    
## politicalstability_O       2.137e-02  4.177e-03   5.116 3.13e-07 ***
## politicalstability_D      -4.177e-04  4.182e-03  -0.100 0.920431    
## class_conflict_Omedium     2.640e-02  5.868e-03   4.499 6.85e-06 ***
## class_conflict_Ohigh       3.527e-03  8.537e-03   0.413 0.679491    
## class_conflict_Overy high  1.292e-01  1.138e-02  11.354  < 2e-16 ***
## class_conflict_Dmedium    -4.632e-02  5.864e-03  -7.899 2.87e-15 ***
## class_conflict_Dhigh      -4.264e-02  8.536e-03  -4.995 5.91e-07 ***
## class_conflict_Dvery high  3.062e-02  1.139e-02   2.688 0.007180 ** 
## log(1 + y_1)               9.680e-01  9.861e-04 981.708  < 2e-16 ***
## log(1 + g)                -1.435e-02  2.485e-03  -5.775 7.74e-09 ***
## contigyes                  1.507e-01  1.630e-02   9.247  < 2e-16 ***
## comlangyes                 3.632e-02  4.922e-03   7.379 1.62e-13 ***
## colonyyes                  8.706e-02  2.381e-02   3.656 0.000257 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3766 on 44238 degrees of freedom
## Multiple R-squared:  0.9816, Adjusted R-squared:  0.9816 
## F-statistic: 9.434e+04 on 25 and 44238 DF,  p-value: < 2.2e-16

We now reproduce the Moran scatter plot shown in Figure C.7.

mat_out <- listw2mat(nb2listw(nb_out))
# Moran scatter plot  
res_1 <- log(1 + pairs_od_without_intra$y) - 
  predict(spec_2, newdata = pairs_od_without_intra)
N <- nrow(pairs_od_without_intra)
Woy <- numeric(N)
Wdy <- numeric(N)
Wwy <- numeric(N)
require("Matrix")
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
Wo <- Matrix(0, N, N)
Wd <- Matrix(0, N, N)
Ww <- Matrix(0, N, N)

 for(k in 1:N) {
   if(k %in% c(100, 1000, 10000, 20000))
      cat("It works :", k)
   ind_o <- pairs_od_without_intra$cont_o[k]
   ind_d <- pairs_od_without_intra$cont_d[k]
   # origin 
    ind_iso_o <- which(world_iso_3$iso3 == ind_o)
    ind_neighbours_o <- world_iso_3$iso3[which(mat_out[ind_iso_o, ] > 0)]
    
  # destination 
    ind_iso_d <- which(world_iso_3$iso3 == ind_d)
    ind_neighbours_d <- world_iso_3$iso3[which(mat_out[ind_iso_d, ] > 0)]
    
    # Wo
    ind_temp <- which(pairs_od_without_intra$cont_o %in% ind_neighbours_o &
                      pairs_od_without_intra$cont_d == ind_d)
    Woy[k] <- mean(res_1[ind_temp])
    if(length(ind_temp) > 0)
      Wo[k, ind_temp] <- 1 / length(ind_temp)
    
    # Wd
    ind_temp <- which(pairs_od_without_intra$cont_o == ind_o &
      pairs_od_without_intra$cont_d %in% ind_neighbours_d)
    Wdy[k] <- mean(res_1[ind_temp])
    if(length(ind_temp) > 0)
      Wd[k, ind_temp] <- 1 / length(ind_temp)
    
    # Ww
    ind_temp <- which(pairs_od_without_intra$cont_o %in% ind_neighbours_o &
      pairs_od_without_intra$cont_d %in% ind_neighbours_d)
    Wwy[k] <- mean(res_1[ind_temp])
    if(length(ind_temp) > 0)
      Ww[k, ind_temp] <- 1 / length(ind_temp)
 }  
## It works : 100It works : 1000It works : 10000It works : 20000
#pdf("figures/moran_plot.pdf", width = 12, height = 4)
par(mfrow = c(1, 3), las = 1, mgp = c(2.3, 1, 0),
    oma = c(0, 0, 0, 0), mar = c(3.5, 4, 0.5, 0.5))
plot(res_1, Woy, xlab = "SLX residuals", ylab = "Wo x residuals",
     ylim = c(-5, 5), xlim = c(-5, 5))
abline(h = mean(Woy, na.rm = T), lty = 2, col = "lightgrey")
abline(v = 0, lty = 2, col = "lightgrey")
abline(coefficients(lm(Woy ~ res_1)), col = "red", lty = 1)

plot(res_1, Wdy, xlab = "SLX residuals", ylab = "Wd x residuals",
     ylim = c(-5, 5), xlim = c(-5, 5))
abline(h = mean(Wdy, na.rm = T), lty = 2, col = "lightgrey")
abline(v = 0, lty = 2, col = "lightgrey")
abline(coefficients(lm(Wdy ~ res_1)), col = "red", lty = 1)

plot(res_1, Wwy, xlab = "SLX residuals", ylab = "Ww x residuals",
     ylim = c(-5, 5), xlim = c(-5, 5))
abline(h = mean(Wwy, na.rm = T), lty = 2, col = "lightgrey")
abline(v = 0, lty = 2, col = "lightgrey")
abline(coefficients(lm(Wwy ~ res_1)), col = "red", lty = 1)

# dev.off()

The following code also performs a Monte Carlo permutation test to assess the significance of Moran’s \(I\) statistic.

moran.mc(res_1, mat2listw(Wo), nsim = 1000, zero.policy=T)
moran.mc(res_1, mat2listw(Wd), nsim = 1000, zero.policy=T)
moran.mc(res_1, mat2listw(Ww), nsim = 1000, zero.policy=T)
library(rpart)  #library for CART
library(rpart.plot)
library(caret)
t1 <- rpart(my_formula,
            data = pairs_od_without_intra, 
            method = "anova",         #indicates the outcome is continuous
            control = rpart.control(
                       minsplit = 1,  # min number of observ for a split 
                       minbucket = 1, # min nr of obs in terminal nodes
                       cp=0)          #decrease in complex for a split 
)

prune.t1 <- prune(t1, cp=0.00005)   #prune the tree with cp=0.02
printcp(prune.t1)

rpart.plot(prune.t1)  

2.2 SDM

The Spatial Durbin Model (SDM) extends the SLX by incorporating spatial lags of the dependent variable in addition to spatially lagged covariates.
This allows us to capture endogenous feedback effects, where flows between two countries may be influenced not only by their characteristics but also by flows in neighboring origin and destination countries.

We estimate the SDM using the spflow package, which is specifically designed for origin–destination data in non-Cartesian settings.
Unlike standard spatial econometrics packages that mainly target regional or lattice data, spflow handles flow data with separate spatial weight matrices for origins (\(\mathbf{W_o}\)), destinations (\(\mathbf{W_d}\)), and joint origin–destination neighborhoods (\(\mathbf{W_w}\)).

The estimation is performed by maximum likelihood, which jointly identifies:
- the coefficients associated with the explanatory variables, and
- the spatial autocorrelation parameters (\(\rho_o\), \(\rho_d\), \(\rho_w\)).

This framework is therefore particularly well suited for modeling international migration, where interdependencies across origins, destinations, and migration corridors play a central role.

library("spflow")
## 
## Attaching package: 'spflow'
## The following object is masked from 'package:dplyr':
## 
##     id
# nb_out_5, nb_out_10, nb_out
OW_2 <- nb2mat(nb_out_10)

#par(oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0))
#plot(st_geometry(world_iso_3))
#plot(nb_out, st_coordinates(st_centroid(world_iso_3)), add = T)

#pairs_od_without_intra <- pairs_od  %>%
#   filter(y > 0) # , CONTINENT_O == "EUROPE") #, CONTINENT_O == "AFRICA") #  & CONTINENT_O =="EUROPE")  #  & CONTINENT_O == "ASIA") # CONTINENT_O == "AMERICAS") #  filter(cont_o != cont_d)  # %>%  filter(CONTINENT_O == "AMERICA")

mig_net <- spflow_network(
  "world",
  OW,
  world_iso_3,
  "iso3")

mig_net_2 <- spflow_network(
  "world",
  OW_2,
  world_iso_3,
  "iso3")

mig_net_pairs <- spflow_network_pair(
  id_orig_net = "world",
  id_dest_net = "world",
  pair_data = pairs_od_without_intra[, c("cont_o", "cont_d", "y", "y_1", "g", "contig",
                                     "comlang", "colony", "class_conflict_O",
                                     "class_conflict_D")],
  orig_key_column = "cont_o",
  dest_key_column = "cont_d")

mig_multinet <- spflow_network_multi(mig_net, mig_net_pairs)
## Warning: The the data in the network pair world_world is reordered!
mig_multinet_2 <- spflow_network_multi(mig_net_2, mig_net_pairs)
## Warning: The the data in the network pair world_world is reordered!
# define the model that we use to explain the flows...
# ... D_() contains destination variables
# ... O_() contains origin variables
# ... D_() contains intra-regional variables (when origin == destination)
# ... P_() contains pair variables (distances)
flow_formula <-
  log(1 + y) ~
  D_(hwf + dry + Lpop + HDI + politicalstability) +
  O_(hwf + dry + Lpop + HDI + politicalstability) +
  P_(log(1 + g) + log(1 + y_1) + colony + contig + comlang + 
       class_conflict_O + class_conflict_D)

# define what variables to use in an SDM specification
# ... if not given all will be used
sdm_formula <- ~ D_(hwf + dry) + O_(hwf + dry)
# sdm_formula <- ~ D_() + O_()

# define the list of control parameters
estimation_control <- spflow_control(sdm_variables = sdm_formula,
                                     model = "model_9",
                                     use_intra = FALSE,
                                     fitted_value_method = "BPI")

# Estimate the model
spec_3 <- spflow(flow_formula, mig_multinet, 
                 estimation_control = estimation_control)
spec_4 <- spflow(flow_formula, mig_multinet_2, 
                 estimation_control = estimation_control)

Result of specification 3 (Table 3.6):

spec_3
## --------------------------------------------------
## Spatial interaction model estimated by: MLE  
## Spatial correlation structure: SDM (model_9)
## Dependent variable: log(1 + y)
## 
## --------------------------------------------------
## Coefficients:
##                                 est     sd   t.stat  p.val
## rho_d                         0.069  0.002   38.262  0.000
## rho_o                         0.066  0.002   37.344  0.000
## rho_w                        -0.061  0.002  -29.113  0.000
## (Intercept)                  -0.218  0.037   -5.963  0.000
## D_Lpop                        0.003  0.001    2.888  0.004
## D_HDI                        -0.101  0.018   -5.604  0.000
## D_politicalstability         -0.003  0.004   -0.664  0.507
## D_hwf                         0.000  0.000   -1.715  0.086
## D_hwf.lag1                    0.000  0.000    4.463  0.000
## D_dry                        -0.001  0.000   -8.556  0.000
## D_dry.lag1                    0.001  0.000    7.148  0.000
## O_Lpop                        0.004  0.001    3.640  0.000
## O_HDI                         0.149  0.018    8.382  0.000
## O_politicalstability          0.018  0.004    4.438  0.000
## O_hwf                         0.000  0.000   -3.917  0.000
## O_hwf.lag1                    0.000  0.000   -0.611  0.541
## O_dry                         0.001  0.000   11.623  0.000
## O_dry.lag1                   -0.001  0.000  -15.886  0.000
## P_log(1 + g)                  0.006  0.003    2.187  0.029
## P_log(1 + y_1)                0.917  0.002  607.412  0.000
## P_colonyyes                   0.092  0.023    3.960  0.000
## P_contigyes                   0.328  0.017   19.862  0.000
## P_comlangyes                  0.030  0.005    6.175  0.000
## P_class_conflict_Omedium      0.027  0.006    4.758  0.000
## P_class_conflict_Ohigh        0.004  0.008    0.423  0.672
## P_class_conflict_Overy high   0.112  0.011   10.042  0.000
## P_class_conflict_Dmedium     -0.040  0.006   -7.033  0.000
## P_class_conflict_Dhigh       -0.040  0.008   -4.741  0.000
## P_class_conflict_Dvery high   0.018  0.011    1.627  0.104
## 
## --------------------------------------------------
## R2_corr: 0.9835331  
## Observations: 44264  
## Model coherence: Validated

Result of specification 4 (Table 3.6):

spec_4
## --------------------------------------------------
## Spatial interaction model estimated by: MLE  
## Spatial correlation structure: SDM (model_9)
## Dependent variable: log(1 + y)
## 
## --------------------------------------------------
## Coefficients:
##                                 est     sd   t.stat  p.val
## rho_d                         0.071  0.002   34.297  0.000
## rho_o                         0.076  0.002   37.652  0.000
## rho_w                        -0.065  0.003  -23.114  0.000
## (Intercept)                  -0.228  0.040   -5.626  0.000
## D_Lpop                        0.001  0.001    1.078  0.281
## D_HDI                        -0.126  0.018   -6.893  0.000
## D_politicalstability         -0.008  0.004   -1.845  0.065
## D_hwf                         0.000  0.000   -5.187  0.000
## D_hwf.lag1                    0.001  0.000   10.802  0.000
## D_dry                         0.000  0.000   -6.942  0.000
## D_dry.lag1                    0.000  0.000    5.449  0.000
## O_Lpop                        0.004  0.001    3.372  0.001
## O_HDI                         0.119  0.018    6.620  0.000
## O_politicalstability          0.013  0.004    3.041  0.002
## O_hwf                        -0.001  0.000   -8.474  0.000
## O_hwf.lag1                    0.001  0.000    5.324  0.000
## O_dry                         0.000  0.000    6.433  0.000
## O_dry.lag1                   -0.001  0.000  -11.766  0.000
## P_log(1 + g)                  0.011  0.003    3.524  0.000
## P_log(1 + y_1)                0.918  0.002  609.347  0.000
## P_colonyyes                   0.106  0.023    4.527  0.000
## P_contigyes                   0.222  0.016   13.826  0.000
## P_comlangyes                  0.028  0.005    5.712  0.000
## P_class_conflict_Omedium      0.022  0.006    3.829  0.000
## P_class_conflict_Ohigh       -0.009  0.008   -1.043  0.297
## P_class_conflict_Overy high   0.101  0.011    8.976  0.000
## P_class_conflict_Dmedium     -0.048  0.006   -8.222  0.000
## P_class_conflict_Dhigh       -0.050  0.008   -5.914  0.000
## P_class_conflict_Dvery high   0.005  0.011    0.471  0.637
## 
## --------------------------------------------------
## R2_corr: 0.9834237  
## Observations: 44264  
## Model coherence: Validated

We compute the mean squarred error presented in Table 3.6

hat_y <- predict(spec_3)
hat_y_2 <- predict(spec_4)

mean((log(1 + pairs_od_without_intra$y) - predict(spec_1))^2)
## [1] 3.230205
mean((log(1 + pairs_od_without_intra$y) - predict(spec_2))^2)
## [1] 0.1417653
mean((log(1 + pairs_od_without_intra$y) - hat_y$FITTED)^2)
## [1] 0.1267994
mean((log(1 + pairs_od_without_intra$y) - hat_y_2$FITTED)^2)
## [1] 0.1276416

2.3 Impact decomposition

To interpret the SDM results, we compute the impact decomposition, which separates the effects of explanatory variables into origin, destination, and network components.
This approach allows us to go beyond raw parameter estimates and assess how a change in a given covariate at a specific site propagates through the migration system.

Our procedure is as follows:

  1. For each explanatory variable, we introduce a fixed increment (e.g., one additional dry day, one additional heatwave day, +0.0059 for HDI, +0.0444 for political stability, or a 1% increase in population).
  2. The increment is applied at the appropriate location (origin, destination, or lagged versions for neighbors).
  3. Using the estimated SDM, we compute the predicted flows under both the baseline and incremented scenarios.
  4. The difference between the two predictions provides the marginal impact of the increment for each bilateral flow.

By aggregating these flow-level differences, we recover local effects for each country:
- \(OE(s)\) (Origin Effects),
- \(DE(s)\) (Destination Effects),
- \(NE(s)\) (Network Effects), and
- \(TE(s)\) (Total Effects).

This decomposition makes it possible to quantify not only how local shocks (e.g., a drought in country \(s\)) affect migration from or to that country, but also how neighboring countries are indirectly affected through spatial spillovers.

nom_vars <- c("hwf", "dry", "HDI", "politicalstability", "Lpop")
res_impacts <-  data.frame(
  var = rep(nom_vars, each = length(world_iso_3$iso3)), 
  iso3 = rep(world_iso_3$iso3, times = length(nom_vars)),
  Origin = 0,
  Destination = 0,
  Network = 0,
  Total = 0)
add_unit <- c(hwf = 1, dry = 1, HDI = 0.00593, politicalstability = 0.0443764)
ind_inc <- 1
for(k in nom_vars) {
  for(ind_cnty in 1:230) {
    ind_iso <- res_impacts$iso3[ind_cnty]
    # add unit
    my_X_plus_only_ind_cnty <- st_drop_geometry(world_iso_3)
    if(k != "Lpop") {
      my_X_plus_only_ind_cnty[ind_cnty, k] <- my_X_plus_only_ind_cnty[ind_cnty, k] + add_unit[k]
    } else {
      # elasticities for Lpop
     my_X_plus_only_ind_cnty[ind_cnty, k] <- my_X_plus_only_ind_cnty[ind_cnty, k] * 1.01
    }
    my_X_plus_only_ind_cnty[, paste0("Lag_", k)] <-
      lag.listw(nb2listw(nb_out), my_X_plus_only_ind_cnty[, k])
    spec_3_change <- spec_3
    spec_3_change@spflow_matrices$D_[, k] <- my_X_plus_only_ind_cnty[, k]
    spec_3_change@spflow_matrices$O_[, k] <- my_X_plus_only_ind_cnty[, k]
    
    if(k %in% c("hwf", "dry")) {
      spec_3_change@spflow_matrices$D_[, paste0(k , ".lag1")] <-
        my_X_plus_only_ind_cnty[, paste0("Lag_", k)]
      spec_3_change@spflow_matrices$O_[, paste0(k , ".lag1")] <-
        my_X_plus_only_ind_cnty[, paste0("Lag_", k)]
    }
    
    hat_y_p <- predict(spec_3_change)
    temp_diff <- exp(hat_y_p$PREDICTION) - exp(hat_y$PREDICTION)
    OE <- sum(temp_diff[
      which(substr(pairs_od_without_intra$cont_o, 1, 3) == ind_iso)])
    DE <- sum(temp_diff[
      which(substr(pairs_od_without_intra$cont_d, 1, 3) == ind_iso)])
    TE <- sum(temp_diff)
    NE <- TE - DE - OE
  
    res_impacts[ind_inc, -c(1, 2)] <- c(OE, DE, NE, TE)
    cat(c(world_iso_3$iso3[ind_cnty], " O: ", OE, " D: ", 
          DE , " NE: ", NE, " TE: ", TE,"\n \n"))
    
  ind_inc <- ind_inc + 1
  }
}
save(res_impacts, file = "result/res_impacts.RData")

We first compute the summary impacts, reported in Table 3.7.

res_impacts %>% 
  group_by(var) %>%
  summarise(Origin = sum(Origin) / N,
            Destination = sum(Destination) / N,
            Network = sum(Network) / N,
            Total = sum(Total) / N)
## # A tibble: 5 × 5
##   var                Origin Destination  Network  Total
##   <chr>               <dbl>       <dbl>    <dbl>  <dbl>
## 1 HDI                  3.31      -2.22  -0.00691  1.08 
## 2 Lpop                 2.68       2.06   0.0209   4.76 
## 3 dry                  3.62      -2.88  -2.91    -2.17 
## 4 hwf                 -1.21      -0.593  1.46    -0.346
## 5 politicalstability   3.03      -0.476  0.00492  2.56

Then, for each decomposition component, namely OE(s), DE(s), NE(s), and TE(s), we construct a barplot displaying the ten largest local impacts.

type_E <- c("Origin", "Destination", "Network", "Total")

temp_pivot <- data.frame(country = character(),
                         nom_country = character(),
                         var = character(),
                         impacts = numeric(),
                         type = character(), 
                         TOP = character()) 
for(l in c("hwf", "dry")) {
  for(k in type_E) {
  temp_pivot <- rbind(temp_pivot,
      res_impacts %>%
        filter(var == l) %>%
        rename(impacts = k) %>%
        mutate(country = tolower(countrycode::countrycode(iso3, 
                                      "iso3c", "iso2c")),
         nom_country = countrycode::countrycode(iso3, 
                                  "iso3c", "country.name")) %>%
        arrange(-impacts) %>%
        mutate(type = k) %>% 
        select(country, nom_country, var, impacts, type) %>%
        filter(impacts > quantile(impacts, 0.9) | 
             impacts < quantile(impacts, 0.1)) %>%
        mutate(TOP = c(rep("largest", 23), rep("lowest", 23)))
)
  }
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(k)
## 
##   # Now:
##   data %>% select(all_of(k))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# impacts Dry Total
choice_var <- "dry"
choice_var <- "hwf"


for(choice_var in c("hwf", "dry")) {
  for(l in 1:4) {
df_plot <- res_impacts %>%
  filter(var == choice_var) %>%
  rename(impacts = l + 2) %>%
  mutate(
    country     = tolower(countrycode(iso3, "iso3c", "iso2c")),
    nom_country = iso3 # countrycode(iso3, "iso3c", "country.name")
  )

# top et bottom
df_top_bottom <- bind_rows(
  slice_max(df_plot, impacts, n = 10),
  slice_min(df_plot, impacts, n = 10)
) %>%
  arrange(impacts) %>%   # ordonner numériquement
  mutate(
    nom_country = factor(nom_country, levels = nom_country) # fixer l'ordre
  )

# insérer "..." au milieu
mid_row <- tibble(
  nom_country = "...",
  impacts = NA_real_,
  country = NA_character_
)

# combiner et refixer l'ordre
df_final <- bind_rows(
  slice_head(df_top_bottom, n = 10),
  mid_row,
  slice_tail(df_top_bottom, n = 10)
) %>%
  mutate(nom_country = factor(nom_country, levels = nom_country))

# plot
ggplot(df_final, aes(x = nom_country, y = impacts)) +
  geom_col(na.rm = TRUE) +
  geom_flag(aes(y = 0, country = country), na.rm = TRUE) +
  coord_flip() +
  theme_minimal() +
  xlab("") + ylab("") +
  ggtitle(paste0(type_E[l], " Effect (", choice_var, ")"))

#ggsave(paste0(type_E[l], "_impacts_", choice_var, ".pdf"), 
#       width = 5, height = 6)
  }
}

Map the effects:

final_indicator_b <- merge(final_indicator, res_impacts %>% 
  filter(var == "hwf") %>%
  select(-var) %>%
  rename(hwf_Origin = Origin,
         hwf_Destination = Destination,
         hwf_Network = Network,
         hwf_Total = Total), by = "iso3")
final_indicator_b <- merge(final_indicator_b, res_impacts %>% 
  filter(var == "dry") %>%
  select(-var) %>%
  rename(dry_Origin = Origin,
         dry_Destination = Destination,
         dry_Network = Network,
         dry_Total = Total), by = "iso3")

final_indicator_b <- st_transform(final_indicator_b, st_crs(sea))
for(s in c("dry", "hwf")) {
#  pdf(paste0("figures/map_", s, "_impact.pdf"), 
#      width = 12 * 1.7, height = 6.7 * 1.6)      
  par(mar = c(0, 1, 1, 0), mfrow = c(2, 2))

  for(l in c("Total", "Origin", "Destination", "Network")) {
    vec_temp <- st_drop_geometry(final_indicator_b)[, paste0(s, "_", l)]
  ##############
    if(l %in% c("Total", "Network")) {
      my_mid <- 0
      bk_1 <- round(classInt::classIntervals(vec_temp[vec_temp < my_mid], 4,
                                           "kmeans")$brks,  digits = 6)
      bk_2 <- round(classInt::classIntervals(vec_temp[vec_temp >= my_mid], 4,
                                           "kmeans")$brks,  digits = 6)
      bk <- c(bk_1[1:4], my_mid, bk_2[2:5]) 
      bk[1] <- min(vec_temp)
      bk[length(bk)] <- max(vec_temp)
      pal1 <- rev(RColorBrewer::brewer.pal(8, "RdBu"))  
      } else {
        ##############
        bk <- round(classInt::classIntervals(vec_temp, 9, "kmeans")$brks, digits = 6)
        bk[1] <- min(vec_temp)
        bk[length(bk)] <- max(vec_temp)
        if(s == "hwf")
          pal1 <- rev(RColorBrewer::brewer.pal(9, "Blues"))
        else {
          if(l == "Origin")
            pal1 <- RColorBrewer::brewer.pal(9, "Reds")
          else
            pal1 <- rev(RColorBrewer::brewer.pal(9, "Blues"))
        }
      }
    ###############
      
    ind <- findInterval(vec_temp, bk, all.inside = TRUE)
    plot(st_geometry(final_indicator_b), col = pal1[ind], 
         border = "lightgrey", pch = 15, cex = 0.5, lwd = 0.1)
    # title("Consecutive Dry Day in 2024", line = -1.15)
    plot(sea, col = scales::alpha("lightblue", 0.3), 
         border = rgb(0.2, 0.2, 0.2), add = T)
    
    plot(st_geometry(world_union), add = T, border = rgb(0.2, 0.2, 0.2))
    plot(long_sf, add = T, col = "lightgrey")
    plot(lat_sf, add = T, col = "lightgrey")

## parameters for legend
    poly_leg_temp <- vector("list", length(bk) - 1)
    for(k in 1:(length(bk) - 1)) {
      poly_leg_temp[[k]] <- rbind(c(-10000000 + (k - 1) * 3000000, 9200000), 
                              c(-10000000 + 1000000 + (k - 1) * 3000000, 9200000),
                              c(-10000000 + 1000000 + (k - 1) * 3000000, 8900000),
                              c(-10000000 + (k - 1) * 3000000, 8900000),
                              c(-10000000 + (k - 1) * 3000000, 9200000))
    }
    poly_leg <- st_sfc(st_polygon(poly_leg_temp[1]))
    for(k in 2:length(poly_leg_temp)) {
      poly_leg <- c(poly_leg, st_sfc(st_polygon(poly_leg_temp[k])))
    }
    # legend
    plot(poly_leg, col = pal1, add = T)
    temp_coord <- st_coordinates(st_centroid(poly_leg))
    bk_round <- round(bk, 1)
    for(k in 1:length(poly_leg)) {
      if(k == 1) {
        my_text <- paste0("<", bk_round[2])
        } else {
          if(k == (length(bk) - 1)) {
            my_text <- paste0(">=", bk_round[length(bk) - 1])
            } else {
              my_text <- paste0("[", bk_round[k], "; ", bk_round[k+1], "[")
            }
        }
      text(temp_coord[k, 1], temp_coord[k, 2] + 10, my_text, pos = 3, cex = 0.7)
    }
    title(paste0(l, " impacts"), line = -1)
  }
#   dev.off()
}